Questions 1–10 (omitted here).
11.a A movie should appear in the dataset at least 18 times: each kept film has one record per day of each weekend (Friday, Saturday, and Sunday), from its opening weekend through at least its sixth weekend — 3 days × 6 weekends. The films that were dropped are the ones that were not in theaters for more than 6 weekends.
11.b
# Keep only the films that stayed in theaters long enough
# (i.e. were not flagged with dropped == 1).
films_used <- filter(films, dropped != 1)
11.c
# Calendar day when "12 Rounds" opened in theaters.
round_12_date <- as.Date("2009-03-27")
# Value of the raw integer "date" column for 12 Rounds, i.e. the number
# of days between the dataset's origin and the opening date.
days_before <- 17984 #number under 12 Rounds "date" column
# Back out the origin that the integer "date" column counts from.
# NOTE(review): the "+ 1" lands exactly on 1960-01-01, which looks like the
# Stata date epoch; this implies the raw column is 1-indexed from that
# origin — confirm against the codebook before relying on exact dates.
reference_date <- round_12_date - days_before + 1
# Print the recovered origin date.
print(reference_date)
## [1] "1960-01-01"
11.d
# Turn the integer date offset into a real calendar date, and move the
# new column up to the fourth position for easy inspection.
films_used_d <- films_used |>
  mutate(movie_date = as.Date(reference_date + date)) |>
  select(
    title, production_budget, release_yr,
    movie_date, sat_date, everything()
  )
# Quick check: each title alongside its reconstructed calendar date.
films_used_d[, c("title", "movie_date")]
11.e
# Recover each weekend's Saturday from sat_date, then flag which day of
# the weekend (Fri/Sat/Sun) every observation falls on.
films_used_date <- films_used_d |>
  mutate(
    sat_day = reference_date + sat_date,            # Saturday's calendar date
    sat_day_of_week = wday(sat_day, label = TRUE),  # sanity check: should read "Sat"
    fri_dummy = ifelse(movie_date == sat_day - 1, 1, 0),
    sat_dummy = ifelse(movie_date == sat_day, 1, 0),
    # Original note: there was no movie released on Sunday, so this simply
    # marks the Sunday observation of each weekend.
    sun_dummy = ifelse(movie_date == sat_day + 1, 1, 0)
  ) |>
  arrange(title)
films_used_date[, c("title", "movie_date","sat_day" ,"fri_dummy", "sat_dummy", "sun_dummy")]
11.f
# Number each film's weekends 1, 2, ... in chronological order, then
# expand that numbering into indicator columns with fastDummies.
films_used_date <- films_used_date |>
  arrange(title, sat_day) |>
  group_by(title) |>
  # factor() levels of sat_date sort chronologically, so the integer codes
  # label the film's weekends in order within each title.
  mutate(week = as.integer(factor(sat_date))) |>
  # Fix: drop the grouping so dummy_cols() and every later verb operate on
  # an ungrouped frame (the original left the group_by(title) attached).
  ungroup()
# One 0/1 column per distinct week value (week_1, week_2, ...).
films_used_date <- dummy_cols(films_used_date, select_columns = 'week')
films_used_date[, c("title", "movie_date" ,"week_1", "week_2", "week_3")]
11.g
# fastDummies again: expand release_yr into one indicator column per year.
film <- films_used_date |>
  dummy_cols(select_columns = 'release_yr')
film[, c("title", "release_yr", "release_yr_2009", "release_yr_2010")]
11.h
# Label each observation with its weekend day, average tickets by
# (week in release, day), and plot the three series.
temp <- film |>
  mutate(
    weekend = case_when(
      sat_dummy == 1 ~ "Saturday",
      fri_dummy == 1 ~ "Friday",
      sun_dummy == 1 ~ "Sunday"
    )
  ) |>
  group_by(week, weekend) |>
  summarize(mean = mean(tickets, na.rm = TRUE))

temp |>
  ggplot(aes(x = week, y = mean, color = as.factor(weekend))) +
  geom_point() +
  geom_line() +
  scale_color_manual(values = c(
    "Saturday" = "#4682B4",
    "Friday" = "red",
    "Sunday" = "#8B008B"
  )) +
  labs(x = "Week", y = "Tickets", color = "Weekend") +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 6)) + # x-axis ticks
  scale_y_continuous(breaks = scales::pretty_breaks(n = 6)) + # y-axis ticks
  theme_bw()
NOT NEEDED — the following section was not required for the assignment but is kept for reference.
# Build the regressor lists from column-name patterns, fit the baseline
# ticket model, and compute abnormal viewership (actual - predicted).
# Column names containing "hh" are the holiday indicators.
holiday <- str_subset(colnames(film), "hh")
# Collapse each set of indicator names into an "a + b + c" formula piece.
holiday_dummy <- str_c(holiday, collapse = " + ")
# Day-of-weekend indicators (fri_dummy / sat_dummy / sun_dummy).
weekend_dummy <- str_c(str_subset(colnames(film), "dummy"), collapse = " + ")
# Week-in-release indicators (week_1, week_2, ...).
week_dummy <- str_c(str_subset(colnames(film), "week_"), collapse = " + ")
# Release-year indicators (release_yr_2009, ...).
year_dummy <- str_c(str_subset(colnames(film), "release_yr_"), collapse = " + ")
# Assemble the full formula and fit it.
mod1 <- glue("tickets ~ {weekend_dummy} + {week_dummy} + {year_dummy} + {holiday_dummy}")
reg_mod1 <- lm(as.formula(mod1), data = film)
# Abnormal viewership is the residual of the baseline model.
film <- film |>
  mutate(
    pred_tickets = predict(reg_mod1, film),
    abnormal_viewership = tickets - pred_tickets
  )
film[, c("tickets","pred_tickets", "abnormal_viewership", "sat_day")]
weather <- read_dta("data/weather_collapsed_day.dta")
# Prefix every weather column with "www_" so the weather variables are
# easy to pick out after joining onto the film data.
original_cols <- colnames(weather)
colnames(weather) <- paste0("www_", original_cols)
weather
# Attach weather to each film row by matching both the observation date
# and the weekend's Saturday. left_join keeps every film row; weather
# rows with no matching date simply contribute NAs.
weather_film <- film |>
  left_join(
    weather,
    by = c(
      "movie_date" = "www_date",
      "sat_day" = "www_sat_date"
    )
  )
# Inspect the join keys next to the merged weather columns.
weather_film |>
  select(movie_date, sat_day, contains("www"))
# For every weather variable (www_*), regress it on the same dummies as
# the ticket model and store the prediction (pred_www_*) and residual
# (abnormal_www_*) — the "abnormal weather" series.
www_columns <- str_subset(colnames(weather_film), "www_")
# Work on a copy so weather_film is left untouched.
df <- weather_film
# Right-hand side shared by every weather regression.
regressors <- glue("~ {weekend_dummy} + {week_dummy} + {year_dummy} + {holiday_dummy}")
for (columns in www_columns) {
  # Full formula: this weather column ~ shared dummies.
  model <- paste(columns, regressors)
  pred_name <- paste("pred", columns, sep = "_")
  resid_name <- paste("abnormal", columns, sep = "_")
  # Fit once per column; hoisted out of mutate() for clarity.
  fit <- lm(as.formula(model), data = df)
  df <- df |>
    # predict() on the full frame keeps rows aligned even if lm() dropped
    # observations with missing values.
    mutate(!!pred_name := predict(fit, df)) |>
    # Residual = observed - predicted. .data[[ ]] replaces the fragile
    # eval(parse(text = ...)) lookup of the original.
    mutate(!!resid_name := .data[[columns]] - .data[[pred_name]])
}
# Keep only the abnormal_* weather residuals: drop the raw www_* columns
# and their pred_www_* fitted values.
new_weather <- df |>
  select(-contains("pred_www"), -starts_with("www"))
# Re-fit the baseline ticket model on second-weekend observations only,
# then compute week-2 abnormal viewership.
week_2_data <- new_weather |>
  filter(week_2 == 1)
# Same specification as mod1, estimated on the week-2 subset.
reg_mod2 <- lm(as.formula(mod1), data = week_2_data)
new_weather_film_wk2 <- week_2_data |>
  mutate(
    pred_tickets_wk_2 = predict(reg_mod2, week_2_data),
    abnormal_viewership_wk_2 = tickets - pred_tickets_wk_2
  )
new_weather_film_wk2[, c("tickets", "pred_tickets_wk_2", "week_2", "abnormal_viewership_wk_2")]
# Abnormal viewership in week 1 ------------------------------------------
# Subset to opening-week observations.
week_1_data <- new_weather |>
  filter(week_1 == 1)
# NOTE(review): this intentionally reuses (and overwrites) mod1 and
# reg_mod1 from above, now estimated on the week-1 subset only.
mod1 <- glue("tickets ~ {weekend_dummy} + {week_dummy} + {year_dummy} + {holiday_dummy}")
reg_mod1 <- lm(as.formula(mod1), data = week_1_data)
new_weather_film_wk1 <- week_1_data |>
  mutate(
    pred_tickets_wk_1 = predict(reg_mod1, week_1_data),
    abnormal_viewership_wk1 = tickets - pred_tickets_wk_1
  )
17.a OLS
# OLS of week-1 abnormal viewership on every abnormal weather residual.
abnormal_weather_wk1_names <- str_subset(
  colnames(new_weather_film_wk1), "abnormal_www"
)
# Collapse the residual names into the formula's right-hand side.
abnormal_weather_wk1 <- str_c(abnormal_weather_wk1_names, collapse = "+")
ols_glue <- glue("abnormal_viewership_wk1 ~ {abnormal_weather_wk1}")
ols_mod <- lm(as.formula(ols_glue), data = new_weather_film_wk1)
#modelsummary(list(ols_mod), output = "gt")
17.b
# Forward stepwise selection (leaps::regsubsets) over the abnormal
# weather residuals, choosing the model with the best adjusted R^2.
leaps_data <- new_weather_film_wk1 |>
  select(abnormal_viewership_wk1, all_of(abnormal_weather_wk1_names))
forward <- regsubsets(
  abnormal_viewership_wk1 ~ .,
  data = leaps_data,
  method = "forward"
)
## Reordering variables and trying again:
summary_forward <- summary(forward)
# Index of the candidate model with the highest adjusted R^2.
best_model_index_fwd <- which.max(summary_forward$adjr2) #9th model has the highest
# Predictor names in that model; [-1] drops the intercept.
best_adjr_predictors <- names(coef(forward, id = best_model_index_fwd)[-1])
best_adjr_predictors
## [1] "abnormal_www_rain" "abnormal_www_mat5_60" "abnormal_www_mat5_85"
## [4] "abnormal_www_mat5_90" "abnormal_www_prec_1" "abnormal_www_cloud_0"
## [7] "abnormal_www_cloud_4" "abnormal_www_cloud_5" "abnormal_www_cloud_8"
# Refit a plain lm using the forward-selected predictors.
regs_fwd <- str_c(best_adjr_predictors, collapse = " + ")
fwd_glue <- glue("abnormal_viewership_wk1 ~ {regs_fwd}")
fwd_mod <- lm(as.formula(fwd_glue), data = new_weather_film_wk1)
17.c
# Backward stepwise elimination from the full OLS model (trace = 0
# suppresses the step-by-step printout).
backward <- step(ols_mod, direction = "backward", trace = 0)
# Surviving predictor names; [-1] drops the intercept.
best_bkwd_predictors <- names(coefficients(backward)[-1])
best_bkwd_predictors
## [1] "abnormal_www_rain" "abnormal_www_mat5_45" "abnormal_www_mat5_55"
## [4] "abnormal_www_mat5_70" "abnormal_www_mat5_75" "abnormal_www_prec_0"
## [7] "abnormal_www_cloud_3" "abnormal_www_cloud_4" "abnormal_www_mat_la"
# Refit a plain lm using the backward-selected predictors.
regs_bkwd <- str_c(best_bkwd_predictors, collapse = " + ")
bkwd_glue <- glue("abnormal_viewership_wk1 ~ {regs_bkwd}")
bkwd_mod <- lm(as.formula(bkwd_glue), data = new_weather_film_wk1)
17.d
# LASSO (alpha = 1) with 5-fold cross-validation over the abnormal
# weather residuals.
# NOTE(review): cv.glmnet assigns folds randomly; without set.seed() the
# selected lambda can differ between runs — confirm this is acceptable.
lasso_mod <- cv.glmnet(
  x = as.matrix(
    new_weather_film_wk1 |>
      select(all_of(abnormal_weather_wk1_names))
  ),
  y = new_weather_film_wk1 |>
    pull(abnormal_viewership_wk1), # pull() extracts the numeric vector
  alpha = 1,  # LASSO penalty
  nfolds = 5  # 5-fold cross-validation
)
# Predictions at the CV-minimizing lambda (printed, not stored).
new_weather_film_wk1 |>
  mutate(
    pred = predict(
      lasso_mod,
      as.matrix(
        new_weather_film_wk1 |>
          select(all_of(abnormal_weather_wk1_names))
      ),
      s = "lambda.min"
    )
  )
#movies <- read_csv("data/movie_lens_20m/movie.csv")
#ratings <- read_csv("data/movie_lens_20m/rating.csv")